Authors: Mauro Venticinque, Angelo Schillaci, Daniele Tambone

GitHub project: Bank-Marketing

Date: 2025-03-27

Introduction

Here we will write some information about the project.

1 Exploratory Data Analysis

datatable(head(train, 100), options = list(scrollX = TRUE))
str(train)
## 'data.frame':    32950 obs. of  22 variables:
##  $ X             : int  35248 39854 14530 27822 40199 21227 16836 39099 38565 38152 ...
##  $ age           : int  30 39 43 27 56 41 57 46 61 35 ...
##  $ job           : chr  "blue-collar" "technician" "services" "student" ...
##  $ marital       : chr  "married" "married" "single" "single" ...
##  $ education     : chr  "professional.course" "university.degree" "high.school" "high.school" ...
##  $ default       : chr  "no" "no" "no" "no" ...
##  $ housing       : chr  "no" "yes" "no" "yes" ...
##  $ loan          : chr  "no" "no" "no" "no" ...
##  $ contact       : chr  "cellular" "cellular" "cellular" "cellular" ...
##  $ month         : chr  "may" "jun" "jul" "mar" ...
##  $ day_of_week   : chr  "fri" "mon" "tue" "thu" ...
##  $ duration      : int  1357 713 1317 80 230 697 1441 679 106 234 ...
##  $ campaign      : int  4 2 4 4 2 2 2 1 2 1 ...
##  $ pdays         : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : int  1 0 0 0 1 0 0 0 1 0 ...
##  $ poutcome      : chr  "failure" "nonexistent" "nonexistent" "nonexistent" ...
##  $ emp.var.rate  : num  -1.8 -1.7 1.4 -1.8 -1.7 1.4 1.4 -3 -3.4 -3.4 ...
##  $ cons.price.idx: num  92.9 94.1 93.9 92.8 94.2 ...
##  $ cons.conf.idx : num  -46.2 -39.8 -42.7 -50 -40.3 -36.1 -42.7 -33 -26.9 -29.8 ...
##  $ euribor3m     : num  1.25 0.72 4.96 1.65 0.87 ...
##  $ nr.employed   : num  5099 4992 5228 5099 4992 ...
##  $ subscribed    : chr  "yes" "yes" "yes" "yes" ...
attach(train)

1.1 Variable descriptions

1.1.1 Bank client data:

  1. X (Integer): id of customer
  2. age (Integer): age of the customer
  3. job (Categorical): occupation
  4. marital (Categorical): marital status
  5. education (Categorical): education level
  6. default (Binary): has credit in default?
  7. housing (Binary): has housing loan?
  8. loan (Binary): has personal loan?
  9. contact (Categorical): contact communication type
  10. month (Categorical): last contact month of year
  11. day_of_week (Integer): last contact day of the week
  12. duration (Integer): last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y=‘no’). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model

1.1.2 Other attributes:

  1. campaign (Integer): number of contacts performed during this campaign and for this client (numeric, includes last contact)
  2. pdays (Integer): number of days that passed by after the client was last contacted from a previous campaign (numeric; -1 means client was not previously contacted)
  3. previous (Integer): number of contacts performed before this campaign and for this client
  4. poutcome (Categorical): outcome of the previous marketing campaign (categorical: ‘failure’,‘nonexistent’,‘success’)

1.1.3 Social and economic context attributes

  1. emp.var.rate (Integer): employment variation rate - quarterly indicator
  2. cons.price.idx (Integer): consumer price index - monthly indicator
  3. cons.conf.idx (Integer): consumer confidence index - monthly indicator
  4. euribor3m (Integer): euribor 3 month rate - daily indicator
  5. nr.employed (Integer): number of employees - quarterly indicator

1.1.4 Output variable (desired target)

  1. subscribed (Binary): has the client subscribed a term deposit?

Source: UCI Machine Learning Repository

vis_dat(train)

skim(train)
Data summary
Name train
Number of rows 32950
Number of columns 22
_______________________
Column type frequency:
character 11
numeric 11
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
job 0 1 6 13 0 12 0
marital 0 1 6 8 0 4 0
education 0 1 7 19 0 8 0
default 0 1 2 7 0 3 0
housing 0 1 2 7 0 3 0
loan 0 1 2 7 0 3 0
contact 0 1 8 9 0 2 0
month 0 1 3 3 0 10 0
day_of_week 0 1 3 3 0 5 0
poutcome 0 1 7 11 0 3 0
subscribed 0 1 2 3 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
X 0 1 20622.42 11882.00 1.00 10346.50 20629.50 30883.75 41188.00 ▇▇▇▇▇
age 0 1 40.04 10.45 17.00 32.00 38.00 47.00 98.00 ▅▇▃▁▁
duration 0 1 258.66 260.83 0.00 102.00 180.00 318.00 4918.00 ▇▁▁▁▁
campaign 0 1 2.57 2.77 1.00 1.00 2.00 3.00 43.00 ▇▁▁▁▁
pdays 0 1 961.90 188.33 0.00 999.00 999.00 999.00 999.00 ▁▁▁▁▇
previous 0 1 0.17 0.49 0.00 0.00 0.00 0.00 7.00 ▇▁▁▁▁
emp.var.rate 0 1 0.08 1.57 -3.40 -1.80 1.10 1.40 1.40 ▁▃▁▁▇
cons.price.idx 0 1 93.57 0.58 92.20 93.08 93.75 93.99 94.77 ▁▆▃▇▂
cons.conf.idx 0 1 -40.49 4.63 -50.80 -42.70 -41.80 -36.40 -26.90 ▅▇▁▇▁
euribor3m 0 1 3.62 1.74 0.63 1.34 4.86 4.96 5.04 ▅▁▁▁▇
nr.employed 0 1 5167.01 72.31 4963.60 5099.10 5191.00 5228.10 5228.10 ▁▁▃▁▇
plot_ly(train, x = subscribed, type = 'histogram')
corrplot(cor(train[, c("X", "age", "duration", "campaign", "pdays", "previous", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed")]), method="pie")

plot_ly(train, x = job, y = age, type = 'box', color = job)
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
plot_ly(train, x = education, y = age, type = 'box', color = education)
eduResp <- ggplot(train, aes(x = education, fill = factor(subscribed))) +
  geom_bar(position = "fill") +
  coord_flip() +
  ylab("Proportion") +
  scale_fill_discrete(name = "Subscribed") +
  xlab("Education") +
  geom_hline(yintercept = 0.5, linetype = "dashed", color = "red") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

eduFreq <- ggplot(as.data.frame(table(education)/sum(table(education))*100), aes(x = reorder(education, Freq), y = Freq)) +
  geom_bar(stat = "identity", color = "gray",  fill = "steelblue", alpha=0.9) +  
  coord_flip() +
  labs(title = "Education", x = "Education Level", y = "Count") +
  theme_minimal()

eduFreq / eduResp

ggplot(train, aes(x = poutcome, fill = factor(subscribed))) +
  geom_bar(position = "fill") +
  ylab("Proportion") +
  scale_fill_discrete(name = "Subscribed") +
  xlab("Outcome of previous campaign") +
  geom_hline(yintercept = 0.5, linetype = "dashed", color = "red") +
  theme_minimal() +
  geom_text(
    data = transform(as.data.frame(table(train$poutcome)),
                     poutcome = Var1,
                     label = paste0(round(100 * Freq / sum(Freq), 1), "%")),
    aes(x = poutcome, y = 1.05, label = label),
    inherit.aes = FALSE
  )

ggplot(train, aes(age)) + geom_histogram(binwidth=4,position="fill",aes(fill=factor(subscribed)))+scale_fill_discrete(name="Subscribed")+ylab("proportion")+geom_hline(yintercept=0.5)

ggplot(train, aes(x = job, fill = factor(subscribed))) +
  geom_bar(position = "fill") +
  ylab("Proportion") +
  scale_fill_discrete(name = "Subscribed") +
  xlab("Job") +
  geom_hline(yintercept = 0.5, linetype = "dashed", color = "red") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  geom_text(
    data = transform(as.data.frame(table(train$job)),
                     job = Var1,
                     label = paste0(round(100 * Freq / sum(Freq), 1), "%")),
    aes(x = job, y = 1.05, label = label),
    inherit.aes = FALSE
  )

ggplot(train, aes(cons.price.idx)) + geom_histogram(binwidth=2,position="fill",aes(fill=factor(subscribed)))+scale_fill_discrete(name="Subscribed")+ylab("proportion")+geom_hline(yintercept=0.5)

ggplot(train, aes(cons.conf.idx)) + geom_histogram(binwidth=3,position="fill",aes(fill=factor(subscribed)))+scale_fill_discrete(name="Subscribed")+ylab("proportion")+geom_hline(yintercept=0.5)

ggplot(train, aes(euribor3m)) + geom_histogram(binwidth=3,position="fill",aes(fill=factor(subscribed)))+scale_fill_discrete(name="Subscribed")+ylab("proportion")+geom_hline(yintercept=0.5)

train$emp_cat <- ifelse(train$emp.var.rate < 0, "Negative", "Positive or Zero")

ggplot(train, aes(x = emp_cat, fill = factor(subscribed))) +
  geom_bar(position = "fill") +
  ylab("Proportion") +
  scale_fill_discrete(name = "Subscribed") +
  xlab("Employment Variation (±)") +
  geom_hline(yintercept = 0.5, linetype = "dashed", color = "red") +
  theme_minimal() +
  geom_text(
    data = {
      df <- as.data.frame(table(train$emp_cat))
      names(df) <- c("emp_cat", "Freq")
      df$label <- paste0(round(100 * df$Freq / sum(df$Freq), 1), "%")
      df$y <- 1.05
      df
    },
    aes(x = emp_cat, y = y, label = label),
    inherit.aes = FALSE
  )

ggplot(train, aes(x = default, fill = factor(subscribed))) +
  geom_bar(position = "fill") +
  ylab("Proportion") +
  scale_fill_discrete(name = "Subscribed") +
  xlab("Default") +
  geom_hline(yintercept = 0.5, linetype = "dashed", color = "red") +
  theme_minimal() +
  geom_text(
    data = transform(as.data.frame(table(train$default)),
                     default = Var1,
                     label = paste0(round(100 * Freq / sum(Freq), 1), "%")),
    aes(x = default, y = 1.05, label = label),
    inherit.aes = FALSE
  )

ggpairs(train[, c("age", "duration", "campaign", "pdays", "previous", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed")], columns = 1:10, 
                 lower = list(continuous = wrap("points", alpha = 0.5, color = "darkred", size=0.5)),
                 title='Scatterplot', axisLabels='none')